We transform the data to reflect the desired structure for modeling.
diabetes_full <- read.csv("/Users/benabijah/Downloads/diabetes_cohort.csv", header = T) %>%
mutate(BENE_ID = as.character(BENE_ID)) %>%
select(-c(MONTH, YR))
# define the list of other diagnoses
diab_names <- c("Prediabetes","Diabetes mellitus without complication","Diabetes mellitus with complication")
other_diagnoses <- unique(diabetes_full$CCSR_CATEGORY_DESCRIPTION[
!(diabetes_full$CCSR_CATEGORY_DESCRIPTION %in% diab_names)])
# get data for all non-diabetes diagnoses
other_diagnoses_data <- diabetes_full %>%
filter(CCSR_CATEGORY_DESCRIPTION %in% other_diagnoses) %>%
mutate(status = 1) %>%
select(-STATE) %>%
pivot_wider(names_from = CCSR_CATEGORY_DESCRIPTION, values_from = status)
other_diagnoses_data
# get data for all diabetes diagnoses
diabetes_only_data <- diabetes_full %>%
filter(CCSR_CATEGORY_DESCRIPTION %in% diab_names) %>%
mutate(STATE = case_when(
CCSR_CATEGORY_DESCRIPTION == "Prediabetes" ~ 1,
CCSR_CATEGORY_DESCRIPTION == "Diabetes mellitus without complication" ~ 2,
CCSR_CATEGORY_DESCRIPTION == "Diabetes mellitus with complication" ~ 3)) %>%
select(-CCSR_CATEGORY_DESCRIPTION) %>%
arrange(desc(BENE_ID), TIME_SINCE_PREDIAB)
diabetes_only_data
diabetes_transformed <- full_join(
diabetes_only_data, other_diagnoses_data, by = c("BENE_ID","TIME_SINCE_PREDIAB")) %>%
mutate(STATE = ifelse(is.na(STATE), 0, STATE)) %>%
arrange(desc(BENE_ID), TIME_SINCE_PREDIAB)
diabetes_transformed
# now let's update the STATE column...
update_state_iteratively <- function(data) {
repeat {
# store previous state values
prev_state <- data$STATE
# update STATE iteratively for second to last rows
data <- data %>%
group_by(BENE_ID) %>%
mutate(STATE = ifelse(row_number() > 1 & STATE == 0, lag(STATE), STATE)) %>%
ungroup()
# break the loop if no changes were made
if (all(prev_state == data$STATE)) break
}
return(data)
}
diabetes_transformed <- update_state_iteratively(diabetes_transformed)
diabetes_transformed
# update comorbodities
update_diagnoses_iteratively <- function(data, cols) {
data <- data %>%
group_by(BENE_ID) %>%
mutate(across(all_of(cols), ~ ifelse(row_number() == 1 & is.na(.x), 0, .x))) %>% # set first row to 0, if NA
ungroup()
repeat {
prev_data <- data[cols] #store previous state for checking convergence
# perform iterative update until all NAs are filled
data <- data %>%
group_by(BENE_ID) %>%
mutate(across(all_of(cols), ~ ifelse(is.na(.x), lag(.x, default = 0), .x))) %>%
ungroup()
# break loop if no changes are made
if (identical(prev_data, data[cols])) break
}
return(data)
}
diabetes_transformed <- update_diagnoses_iteratively(diabetes_transformed, other_diagnoses)
# filter out STATE=0
diabetes_transformed <- diabetes_transformed %>% filter(STATE != 0)
diabetes_transformed
# merge with data from enrollment file
enrollment_diab <- read.csv("/Users/benabijah/Downloads/enrollment_cohort.csv", header = T) %>%
mutate(BENE_ID = as.character(BENE_ID))
diabetes_transformed <- left_join(diabetes_transformed, enrollment_diab, by = c("BENE_ID"))
diabetes_transformed
write.csv(diabetes_transformed, "~/Library/CloudStorage/Dropbox/Projects/EHR/diab_cohort.csv", row.names = FALSE)
saveRDS(diabetes_transformed, file="~/Library/CloudStorage/Dropbox/Projects/EHR/diab_cohort.rds")